# Goal: Create Tables A1, A3 & A4
# by Jennifer Pan and Yiqing Xu

################################################
# Table A1: Means for Question in Samples 1 & 2
################################################

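# Inputs (paths relative to the working dir): data/sample1.dta, data/sample2.dta, data/questions.csv
#   (the code reads "questions.csv"; rename the file if your copy is spelled "quesitons.csv")
# Outputs are written to the tables/ directory.
# Note: an earlier version of this header asked for two numeric vectors of length 63 named
# `stab` and `pred` in the R session. Nothing in this script reads them; stability and
# predictability are computed from sample2.dta in steps 5 and 6.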

suppressPackageStartupMessages({
  library(haven)
  library(dplyr)
  library(purrr)
  library(readr)
  library(tidyr)
  library(tibble)
  library(stringr)
})

# 1) Question lists (63 items; six dimensions)
question.list <- list(
  paste0("s1_", 1:14),  # nationalism
  paste0("s2_", 1:14),  # political liberalism
  paste0("s3_", 1:14),  # market economy (pro-market)
  paste0("s4_", 1:7),   # traditionalism
  paste0("s5_", 1:7),   # social equality
  paste0("s6_", 1:7)    # minority accommodation
)

sign.list <- list(
  c(1, -1, -1, 1, -1, 1, -1, 1, 1, 1, -1, -1, 1, -1), # nationalism *(flip below)*
  c(-1, -1, 1, 1, -1, -1, 1, 1, -1, 1, -1, -1, 1, 1),  # political
  c(-1, 1, -1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, -1),  # market
  c(1, -1, -1, -1, 1, 1, 1),                           # traditionalism *(flip below)*
  c(-1, 1, -1, 1, 1, 1, -1),                           # equality
  c(1, -1, 1, -1, -1, 1, -1)                           # ethnic
)

# Flip signs for nationalism (dim 1) and traditionalism (dim 4) per the Note
sign.list[[1]] <- -sign.list[[1]]
sign.list[[4]] <- -sign.list[[4]]

# 2) Load data
q <- read_dta("data/sample1.dta")
s <- read_dta("data/sample2.dta")

# questions
qs_words <- read_csv("data/questions.csv", show_col_types = FALSE) |>
  transmute(eng = as.character(eng), chn = as.character(chn))

# 3) Item map
dims <- c("nationalism","political","market","traditionalism","equality","ethnic")
item_map <- imap_dfr(
  question.list,
  ~ tibble(var = .x, dim = dims[.y], item_no = seq_along(.x))
) |>
  group_by(dim) |>
  mutate(order_within = row_number()) |>
  ungroup() |>
  mutate(
    sign = unlist(sign.list),
    var_t = str_replace(var, "^s", "t")  # paired retest variable in sample 2
  )

# 4) Sample summaries (means and NA%)
summarize_items <- function(df, vars) {
  purrr::map_dfr(vars, function(v) {
    x <- df[[v]]
    tibble(
      var   = v,
      mean  = mean(as.numeric(x), na.rm = TRUE),
      na_pct = mean(is.na(x)) * 100
    )
  })
}
sum_q <- summarize_items(q, item_map$var) |> rename(mean_q = mean, na_q = na_pct)
sum_s <- summarize_items(s, item_map$var) |> rename(mean_s = mean, na_s = na_pct)

# 5) Stability: |cor(sX_Y, tX_Y)| within sample2.dta
stab_tbl <- map_dfr(1:nrow(item_map), function(i) {
  v1 <- item_map$var[i]
  v2 <- item_map$var_t[i]
  r <- if (all(c(v1, v2) %in% names(s))) {
    suppressWarnings(abs(cor(as.numeric(s[[v1]]), as.numeric(s[[v2]]), use = "pairwise.complete.obs")))
  } else NA_real_
  tibble(var = v1, stability = r)
})

# 6) Predictability: |cor(sX_Y, index2_dim)| in wave 2
dim_to_index2 <- c(
  nationalism    = "index2_nati",
  political      = "index2_poli",
  market         = "index2_econ",
  traditionalism = "index2_trad",
  equality       = "index2_equi",
  ethnic         = "index2_ethn"
)

pred_tbl <- map_dfr(1:nrow(item_map), function(i) {
  v  <- item_map$var[i]
  id <- dim_to_index2[[ item_map$dim[i] ]]
  r <- if (!is.null(id) && all(c(v, id) %in% names(s))) {
    suppressWarnings(abs(cor(as.numeric(s[[v]]), as.numeric(s[[id]]), use = "pairwise.complete.obs")))
  } else NA_real_
  tibble(var = v, predictability = r)
})

# 7) Assemble
tbl <- item_map |>
  select(-var_t) |>
  left_join(sum_q,  by = "var") |>
  left_join(sum_s,  by = "var") |>
  left_join(stab_tbl, by = "var") |>
  left_join(pred_tbl, by = "var")

# 8) Attach bilingual wordings (must be 63 rows in nationalism→…→ethnic order)
stopifnot(nrow(qs_words) == nrow(item_map))
tbl <- bind_cols(tbl, qs_words)

# 9) Reorder for publication: political, market, nationalism, then others
order_levels <- c("political","market","nationalism","traditionalism","equality","ethnic")
tbl <- tbl |>
  mutate(dim = factor(dim, levels = order_levels)) |>
  arrange(dim, order_within) |>
  group_by(dim) |>
  mutate(No = row_number()) |>
  ungroup() |>
  select(
    dim, No,
    eng, chn,
    sign,
    mean_q, na_q,
    mean_s, na_s,
    stability, predictability,
    var
  ) |>
  rename(
    Question_ENG   = eng,
    Question_CHN   = chn,
    Sign           = sign,
    Mean_Sample1   = mean_q,
    `NA%_Sample1`  = na_q,
    Mean_Sample2   = mean_s,
    `NA%_Sample2`  = na_s,
    Stability      = stability,
    Predictability = predictability,
    Variable       = var
  ) |>
  mutate(
    Sign = as.integer(Sign),
    across(c(Mean_Sample1, Mean_Sample2), ~round(., 2)),
    across(c(`NA%_Sample1`, `NA%_Sample2`), ~round(., 2)),
    Stability = round(Stability, 2),
    Predictability = round(Predictability, 2)
  )

# 10) Write CSV
write_csv(tbl, "tables/question_summ.csv")



################################################
# Table A3: Summary Statistics 
################################################

## packages
library(haven)
library(dplyr)
library(tidyr)

## helper: N (non-missing), mean, sd for a set of vars in a (sub)sample
summarize_vars <- function(df, vars) {
  tibble(
    variable = vars,
    Obs  = sapply(vars, \(v) sum(!is.na(df[[v]]))),
    Mean = sapply(vars, \(v) mean(df[[v]], na.rm = TRUE)),
    SD   = sapply(vars, \(v) sd(df[[v]],   na.rm = TRUE))
  )
}


##############
## Sample 1
##############

## variables to summarize
vars <- c("female","age","hschool","jcollege","college","eduyr",
          "han","urbanhukou","ccp","married","inc_low","inc_high","coastal")

### 1) load data
dat <- read_dta("data/sample1.dta")

### 2) Wave 1: full sample 
wave1 <- summarize_vars(dat, vars) %>% 
  dplyr::rename(Obs_W1 = Obs, Mean_W1 = Mean, SD_W1 = SD)

### 3) Wave 2
wave2 <- dat %>% filter(wave2 == 1)
wave2 <- summarize_vars(wave2, vars) %>%
  dplyr::rename(Obs_W2 = Obs, Mean_W2 = Mean, SD_W2 = SD)

### 4) human-readable row labels
labels <- tibble(
  variable = vars,
  Label = c("Female","Age","High school","3-year college",
            "4-year college or above","Years of education","Ethnic Han",
            "Urban Hukou","CCP member","Married",
            "Monthly income below 3000 RMB","Monthly income above 8000 RMB",
            "Coastal provinces")
)

### 5) combine and format
out <- labels %>%
  left_join(wave1, by = "variable") %>%
  left_join(wave2, by = "variable") %>%
  select(Label, Obs_W1, Mean_W1, SD_W1, Obs_W2, Mean_W2, SD_W2) %>%
  mutate(
    across(c(Mean_W1, SD_W1, Mean_W2, SD_W2), ~round(.x, 2)),
    across(c(Obs_W1, Obs_W2), as.integer)
  )
print(out)

# 6) write CSV corresponding to the displayed table
write.csv(out, "tables/summary1.csv", row.names = FALSE)

##############
## Sample 2
##############

## variables to summarize
vars <- c("female","age","hschool","jcollege","college","eduyr",
          "han","urbanhukou","ccp","married","inc_low","inc_high","coastal")

### 1) load data
dat <- read_dta("data/sample2.dta")

### 2) Wave 1: full sample 
wave1 <- summarize_vars(dat, vars) %>% 
  dplyr::rename(Obs_W1 = Obs, Mean_W1 = Mean, SD_W1 = SD)

### 3) Wave 2
wave2 <- dat %>% filter(wave2 == 1)
wave2 <- summarize_vars(wave2, vars) %>%
  dplyr::rename(Obs_W2 = Obs, Mean_W2 = Mean, SD_W2 = SD)

### 4) human-readable row labels
labels <- tibble(
  variable = vars,
  Label = c("Female","Age","High school","3-year college",
            "4-year college or above","Years of education","Ethnic Han",
            "Urban Hukou","CCP member","Married",
            "Monthly income below 3000 RMB","Monthly income above 8000 RMB",
            "Coastal provinces")
)

### 5) combine and format
out <- labels %>%
  left_join(wave1, by = "variable") %>%
  left_join(wave2, by = "variable") %>%
  select(Label, Obs_W1, Mean_W1, SD_W1, Obs_W2, Mean_W2, SD_W2) %>%
  mutate(
    across(c(Mean_W1, SD_W1, Mean_W2, SD_W2), ~round(.x, 2)),
    across(c(Obs_W1, Obs_W2), as.integer)
  )
print(out)

# 6) write CSV corresponding to the displayed table
write.csv(out, "tables/summary2.csv", row.names = FALSE)



########################################################
# Table A4: Self-identification on a Left–Right Scale
########################################################

library(haven)
library(dplyr)
library(readr)
library(tidyr)

s <- read_dta("data/sample2.dta")
tab <- s |>
  mutate(ideology_self = ifelse(is.na(ideology_self), 8, ideology_self)) |>
  count(ideology_self, name = "Count") |>
  mutate(
    Category = c("Extreme Left", "Left", "Left Leaning", "Centrist",
                 "Right Leaning", "Right", "Extreme Right", "Don't Know")[ideology_self],
    Percentage = round(Count / sum(Count) * 100, 1)
  ) |>
  select(Category, Count, Percentage)

tab[nrow(tab) + 1, ] <- list("Total", sum(tab$Count), 100)

# Transpose so that Count and Percentage are rows
tab_wide <- tab |>
  pivot_longer(cols = c(Count, Percentage), names_to = "Measure", values_to = "Value") |>
  pivot_wider(names_from = Category, values_from = Value)

write_csv(tab_wide, "tables/self_id.csv")

